import collections
import re
import urllib.parse
import uuid

from lxml import etree

from ebook_converter import constants as const
from ebook_converter.ebooks.oeb import parse_utils
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks import ConversionError


def XPath(x):
    """Compile the XPath expression ``x`` using the ``const.XPNSMAP`` prefixes.

    Returns the compiled ``etree.XPath`` callable. Raises ``ConversionError``
    when lxml rejects the expression as syntactically invalid.
    """
    try:
        compiled = etree.XPath(x, namespaces=const.XPNSMAP)
    except etree.XPathSyntaxError:
        message = ('The syntax of the XPath expression %s is '
                   'invalid.' % repr(x))
        raise ConversionError(message)
    return compiled


def isspace(x):
    """Return True if ``x`` is empty/None or holds only whitespace.

    Non-breaking spaces (``\\xa0``) count as whitespace. The check is done by
    verifying that nothing is left after removing them and stripping, so a
    string made up solely of non-breaking spaces is correctly reported as
    blank (``''.isspace()`` is False, which made the previous
    replace-then-isspace form misclassify that case as content).
    """
    return not x or not x.replace('\xa0', '').strip()


def at_start(elem):
    """Return True if there is no content before elem.

    The enclosing ``h:body`` is walked in document order. Reaching ``elem``
    without having met an image, an svg, or any non-blank text means it is
    at the start. When ``elem`` has no ``h:body`` ancestor the answer is True.
    """
    bodies = XPath('ancestor-or-self::h:body')(elem)
    if not bodies:
        return True
    parents = frozenset(XPath('ancestor::*')(elem))
    for node in bodies[0].iter():
        if node is elem:
            return True
        # Comments and processing instructions have a non-string tag, hence
        # the rpartition capability check before looking at the local name.
        tag = getattr(node, 'tag', None)
        if hasattr(tag, 'rpartition') and tag.rpartition('}')[-1] in {'img',
                                                                      'svg'}:
            return False
        if not isspace(getattr(node, 'text', None)):
            return False
        # The tail of an ancestor comes after elem, so it is only relevant
        # for nodes that are not ancestors of elem.
        if node not in parents and not isspace(getattr(node, 'tail', None)):
            return False
    return False


class DetectStructure(object):
    """Detect chapters in an OEB book and (re)build its table of contents.

    Instances are callable: ``DetectStructure()(oeb, opts)`` runs the whole
    pass, modifying ``oeb`` (its TOC, its guide and the spine documents) in
    place according to the options in ``opts``.
    """

    # Matches an XPath expression whose last step selects an attribute, e.g.
    # ``//h:h1/@title``; group 1 is the attribute name.
    _TITLE_ATTRIBUTE_RE = re.compile(r'/@([-\w]+)$')

    def __call__(self, oeb, opts):
        """Run structure detection on ``oeb`` using the options ``opts``.

        Steps, in order: detect chapters, optionally auto-generate the TOC,
        filter TOC entries by ``opts.toc_filter``, add page breaks for
        ``opts.page_breaks_before``, name untitled TOC nodes and finally set
        the "start reading at" guide entry.
        """
        self.log = oeb.log
        self.oeb = oeb
        self.opts = opts
        self.log.info('Detecting structure...')

        self.detect_chapters()
        if self.oeb.auto_generated_toc or opts.use_auto_toc:
            orig_toc = self.oeb.toc
            self.oeb.toc = base.TOC()
            self.create_level_based_toc()
            if self.oeb.toc.count() < 1:
                # No level based TOC: fall back to chapters, then to links.
                if not opts.no_chapters_in_toc and self.detected_chapters:
                    self.create_toc_from_chapters()
                if self.oeb.toc.count() < opts.toc_threshold:
                    self.create_toc_from_links()
            if self.oeb.toc.count() < 2 and orig_toc.count() > 2:
                # The generated TOC is poorer than the one we started with.
                self.oeb.toc = orig_toc
            else:
                self.oeb.auto_generated_toc = True
                self.log.info('Auto generated TOC with %d entries.',
                              self.oeb.toc.count())

        if opts.toc_filter is not None:
            regexp = re.compile(opts.toc_filter)
            # Iterate over a snapshot as nodes are removed while looping.
            for node in list(self.oeb.toc.iter()):
                if not node.title or regexp.search(node.title) is not None:
                    self.log.info('Filtering %s from TOC', node.title if
                                  node.title else 'empty node')
                    self.oeb.toc.remove(node)

        if opts.page_breaks_before is not None:
            pb_xpath = XPath(opts.page_breaks_before)
            headings = {'h1', 'h2'}
            for item in oeb.spine:
                for elem in pb_xpath(item.data):
                    # Closest preceding sibling element, if there is one.
                    prev = next(elem.itersiblings(tag=etree.Element,
                                                  preceding=True), None)
                    if (prev is not None and
                            parse_utils.barename(elem.tag) in headings and
                            parse_utils.barename(prev.tag) in headings and
                            (not prev.tail or not prev.tail.split())):
                        # We have two adjacent headings, do not put a page
                        # break on the second one
                        continue

                    style = elem.get('style', '')
                    if style:
                        style += '; '
                    elem.set('style', style+'page-break-before:always')

        for node in self.oeb.toc.iter():
            if not node.title or not node.title.strip():
                node.title = 'Unnamed'

        if self.opts.start_reading_at:
            self.detect_start_reading()

    def detect_start_reading(self):
        """Point the guide's ``text`` entry at ``opts.start_reading_at``.

        The first element matched by the XPath expression, in spine order,
        is used; it is given an ``id`` if it has none. An invalid expression
        or the absence of any match is logged as a warning.
        """
        start_at = self.opts.start_reading_at
        try:
            expr = XPath(start_at)
        except Exception:
            self.log.warning('Invalid start reading at XPath expression, '
                             'ignoring: %s', start_at)
            return
        for item in self.oeb.spine:
            if not hasattr(item.data, 'xpath'):
                # Data without an xpath method cannot be searched; skip it.
                continue
            matches = expr(item.data)
            if not matches:
                continue
            elem = matches[0]
            eid = elem.get('id', None)
            if not eid:
                eid = 'start_reading_at_' + str(uuid.uuid4()).replace('-',
                                                                      '')
                elem.set('id', eid)
            if 'text' in self.oeb.guide:
                self.oeb.guide.remove('text')
            self.oeb.guide.add('text', 'Start', item.href+'#'+eid)
            self.log.info('Setting start reading at position to %s in %s',
                          start_at, item.href)
            return
        self.log.warning("Failed to find start reading at position: %s",
                         start_at)

    def get_toc_parts_for_xpath(self, expr):
        """Split a trailing ``/@attr`` step off the XPath expression ``expr``.

        Returns ``(path, attribute)``. If ``expr`` selects an attribute, the
        attribute step is truncated from the path and its name is returned
        as where to find the title text; otherwise ``(expr, None)``.
        """
        match = self._TITLE_ATTRIBUTE_RE.search(expr)
        if match is not None:
            return expr[0:match.start()], match.group(1)

        return expr, None

    def _find_matches(self, expr, doc, warning):
        """Evaluate the XPath ``expr`` on ``doc``, returning ``[]`` on failure.

        Any exception raised while compiling or evaluating the expression,
        or a result that has no length, is logged using the ``warning``
        format string (given ``expr`` as its argument) and an empty list is
        returned instead.
        """
        try:
            ans = XPath(expr)(doc)
            # Results without a length cannot be iterated by the callers.
            len(ans)
            return ans
        except Exception:
            self.log.warning(warning, expr)
            return []

    def detect_chapters(self):
        """Find chapter elements and mark them as ``opts.chapter_mark`` says.

        Fills ``self.detected_chapters`` with ``(spine item, element)`` pairs
        and sets ``self.chapter_title_attribute``. Depending on the chapter
        mark, an ``hr`` and/or page-breaking element is inserted before each
        detected chapter.
        """
        self.detected_chapters = []
        self.chapter_title_attribute = None

        if not self.opts.chapter:
            return

        chapter_path, title_attribute = (
            self.get_toc_parts_for_xpath(self.opts.chapter))
        self.chapter_title_attribute = title_attribute
        for item in self.oeb.spine:
            for x in self._find_matches(
                    chapter_path, item.data,
                    'Invalid chapter expression, ignoring: %s'):
                self.detected_chapters.append((item, x))

        chapter_mark = self.opts.chapter_mark
        page_break_before = 'display: block; page-break-before: always'
        page_break_after = 'display: block; page-break-after: always'
        # Number of chapters seen so far in each spine item.
        c = collections.Counter()
        for item, elem in self.detected_chapters:
            c[item] += 1
            text = re.sub(r'\s+', ' ', base.xml2text(elem).strip())
            self.log.info('\tDetected chapter: %s', text[:50])
            if chapter_mark == 'none':
                continue
            if chapter_mark == 'rule':
                mark = elem.makeelement(base.tag('xhtml', 'hr'))
            elif chapter_mark == 'pagebreak':
                if c[item] < 3 and at_start(elem):
                    # For the first two elements in this item, check if
                    # they are at the start of the file, in which case
                    # inserting a page break is unnecessary and can lead
                    # to extra blank pages in the PDF Output plugin. We
                    # need to use two as feedbooks epubs match both a
                    # heading tag and its containing div with the default
                    # chapter expression.
                    continue
                mark = elem.makeelement(base.tag('xhtml', 'div'),
                                        style=page_break_after)
            else:  # chapter_mark == 'both':
                mark = elem.makeelement(base.tag('xhtml', 'hr'),
                                        style=page_break_before)
            try:
                elem.addprevious(mark)
            except TypeError:
                self.log.exception('Failed to mark chapter')

    def create_level_based_toc(self):
        """Build the TOC from the level XPath options, if level 1 is set."""
        if self.opts.level1_toc is not None:
            self.add_leveled_toc_items()

    def create_toc_from_chapters(self):
        """Add one top level TOC entry for every detected chapter."""
        counter = self.oeb.toc.next_play_order()
        for item, elem in self.detected_chapters:
            text, href = self.elem_to_link(item, elem,
                                           self.chapter_title_attribute,
                                           counter)
            self.oeb.toc.add(text, href, play_order=counter)
            counter += 1

    def create_toc_from_links(self):
        """Add TOC entries for the local links found in the spine documents.

        Only links without a URL scheme or with the ``file`` scheme are
        used. Targets already in the TOC are skipped, as are duplicate link
        texts unless ``opts.duplicate_links_in_toc`` is set. Stops once
        ``opts.max_toc_links`` entries were added (when that is positive).
        """
        num = 0
        find_links = XPath('//h:a[@href]')
        toc = self.oeb.toc
        for item in self.oeb.spine:
            for a in find_links(item.data):
                href = a.get('href')
                try:
                    purl = urllib.parse.urlparse(href)
                except ValueError:
                    self.log.warning('Ignoring malformed URL: %s', href)
                    continue
                if purl[0] and purl[0] != 'file':
                    # Link has a non-file scheme: not a link into the book.
                    continue
                href, frag = purl.path, purl.fragment
                href = item.abshref(href)
                if frag:
                    href = '#'.join((href, frag))
                if toc.has_href(href):
                    continue
                text = base.xml2text(a)
                text = text[:100].strip()
                if (not self.opts.duplicate_links_in_toc and
                        toc.has_text(text)):
                    continue
                try:
                    toc.add(text, href, play_order=toc.next_play_order())
                except ValueError:
                    self.oeb.log.critical('Failed to process link: %r',
                                          href)
                    # Most likely an incorrectly URL encoded link
                    continue
                num += 1
                if self.opts.max_toc_links > 0 and \
                        num >= self.opts.max_toc_links:
                    self.log.info('Maximum TOC links reached, '
                                  'stopping.')
                    return

    def elem_to_link(self, item, elem, title_attribute, counter):
        """Return ``(text, href)`` for a TOC entry that points at ``elem``.

        The text is taken from the first non-empty source among: the
        ``title_attribute`` attribute (when given), the text content of the
        element, its ``title`` attribute and its ``alt`` attribute.
        Whitespace runs are collapsed and the text is cut to 1000
        characters.

        The element is given the id ``calibre_toc_<counter>`` when it has no
        id or an empty one, so that the returned href always carries a
        usable fragment.
        """
        text = ''
        if title_attribute is not None:
            text = elem.get(title_attribute, '')
        if not text:
            text = base.xml2text(elem).strip()
        if not text:
            text = elem.get('title', '')
        if not text:
            text = elem.get('alt', '')
        text = re.sub(r'\s+', ' ', text.strip())
        text = text[:1000].strip()
        # ``or`` rather than a default: an empty id attribute is unusable.
        elem_id = elem.get('id') or 'calibre_toc_%d' % counter
        elem.set('id', elem_id)
        href = '#'.join((item.href, elem_id))
        return text, href

    def _add_child_entries(self, document, toc_parts, parents, fallback,
                           counter, registry=None):
        """Attach the matches of one TOC level to their parent TOC nodes.

        ``toc_parts`` is the ``(path, title attribute)`` pair of the level.
        ``parents`` maps the elements of the level above to their TOC nodes.
        The parent of a match is the last such element that precedes it in
        ``document``; if there is none, ``fallback`` is used, and if that is
        None as well the match is skipped. A match that is itself a key of
        ``parents`` is never added.

        Created nodes are stored in ``registry`` (keyed by element) when it
        is given. Returns the updated ``counter``.
        """
        expr, title_attribute = toc_parts
        for elem in self._find_matches(
                expr, document.data, 'Invalid ToC expression, ignoring: %s'):
            parent = None
            for item in document.data.iterdescendants():
                if item in parents:
                    parent = parents[item]
                elif item is elem:
                    if parent is None:
                        if fallback is None:
                            break
                        parent = fallback
                    text, _href = self.elem_to_link(document, elem,
                                                    title_attribute, counter)
                    counter += 1
                    if text:
                        node = parent.add(
                            text, _href,
                            play_order=self.oeb.toc.next_play_order())
                        if registry is not None:
                            registry[elem] = node
                    break
        return counter

    def add_leveled_toc_items(self):
        """Build an up to three level TOC from the ``level*_toc`` options.

        Level 1 matches become top level entries. Level 2 and level 3
        matches are attached to the closest preceding entry of the level
        above, falling back to the last such entry created for an earlier
        spine document. Matches without any title text are not added.
        """
        opts = self.opts
        # The expressions are the same for every document: split them once.
        level1_toc, level1_title = self.get_toc_parts_for_xpath(
            opts.level1_toc)
        level2 = level3 = None
        if opts.level2_toc is not None:
            level2 = self.get_toc_parts_for_xpath(opts.level2_toc)
        if opts.level3_toc is not None:
            level3 = self.get_toc_parts_for_xpath(opts.level3_toc)

        added = collections.OrderedDict()
        added2 = collections.OrderedDict()
        counter = 1

        for document in self.oeb.spine:
            # Last nodes created for earlier documents, used as parents for
            # entries that come before any parent in this document.
            previous_level1 = next(reversed(added.values()), None)
            previous_level2 = next(reversed(added2.values()), None)

            for elem in self._find_matches(
                    level1_toc, document.data,
                    'Invalid ToC expression, ignoring: %s'):
                text, _href = self.elem_to_link(document, elem, level1_title,
                                                counter)
                counter += 1
                if text:
                    node = self.oeb.toc.add(
                        text, _href, play_order=self.oeb.toc.next_play_order())
                    added[elem] = node

            if level2 is None or not added:
                continue
            counter = self._add_child_entries(document, level2, added,
                                              previous_level1, counter,
                                              registry=added2)

            if level3 is None or not added2:
                continue
            counter = self._add_child_entries(document, level3, added2,
                                              previous_level2, counter)
